import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
data = pd.read_csv("./data/census_income_dataset.csv")
Our goal was to predict whether income exceeds $50000 per year based on census data in USA.
attributes = pd.read_csv("./attributes_census_income.csv")
attributes
| name | type | description | |
|---|---|---|---|
| 0 | age | integer | age of individual |
| 1 | workclass | string | Values: Private, Self-emp-not-inc, Self-emp-in... |
| 2 | fnlwgt | float | Final sampling weight. Inverse of sampling fra... |
| 3 | education | string | Values: Bachelors, Some-college, 11th, HS-grad... |
| 4 | education_num | integer | NaN |
| 5 | marital_status | string | Values: Married-civ-spouse, Divorced, Never-ma... |
| 6 | occupation | string | Values: Tech-support, Craft-repair, Other-serv... |
| 7 | relationship | string | Values: Wife, Own-child, Husband, Not-in-famil... |
| 8 | race | string | Values: White, Asian-Pac-Islander, Amer-Indian... |
| 9 | sex | string | Values: Female, Male |
| 10 | capital_gain | float | NaN |
| 11 | capital_loss | float | NaN |
| 12 | hours_per_week | float | working hours per week |
| 13 | native_country | string | Values: United-States, Cambodia, England, Puer... |
| 14 | income_level | string | Predictor class if individual earns greater or... |
Firstly, let's see the first rows of the table
data.head()
| age | workclass | fnlwgt | education | education_num | marital_status | occupation | relationship | race | sex | capital_gain | capital_loss | hours_per_week | native_country | income_level | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 39 | State-gov | 77516.0 | Bachelors | 13 | Never-married | Adm-clerical | Not-in-family | White | Male | 2174.0 | 0.0 | 40.0 | United-States | <=50K |
| 1 | 50 | Self-emp-not-inc | 83311.0 | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0.0 | 0.0 | 13.0 | United-States | <=50K |
| 2 | 38 | Private | 215646.0 | HS-grad | 9 | Divorced | Handlers-cleaners | Not-in-family | White | Male | 0.0 | 0.0 | 40.0 | United-States | <=50K |
| 3 | 53 | Private | 234721.0 | 11th | 7 | Married-civ-spouse | Handlers-cleaners | Husband | Black | Male | 0.0 | 0.0 | 40.0 | United-States | <=50K |
| 4 | 28 | Private | 338409.0 | Bachelors | 13 | Married-civ-spouse | Prof-specialty | Wife | Black | Female | 0.0 | 0.0 | 40.0 | Cuba | <=50K |
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 48842 entries, 0 to 48841 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 48842 non-null int64 1 workclass 48842 non-null object 2 fnlwgt 48842 non-null float64 3 education 48842 non-null object 4 education_num 48842 non-null int64 5 marital_status 48842 non-null object 6 occupation 48842 non-null object 7 relationship 48842 non-null object 8 race 48842 non-null object 9 sex 48842 non-null object 10 capital_gain 48842 non-null float64 11 capital_loss 48842 non-null float64 12 hours_per_week 48842 non-null float64 13 native_country 48842 non-null object 14 income_level 48842 non-null object dtypes: float64(4), int64(2), object(9) memory usage: 5.6+ MB
We can see that we have no missing data marked as 'NA'. Let's look for concealed values.
Such values are "?" for categorical variables and "-100000" for numerical variables.
data[data == "?"].count()
age 0 workclass 2799 fnlwgt 0 education 0 education_num 0 marital_status 0 occupation 2809 relationship 0 race 0 sex 0 capital_gain 0 capital_loss 0 hours_per_week 0 native_country 857 income_level 0 dtype: int64
data[(data == "?").any(axis = 1)].shape[0]
3620
# Numeric columns are int64/float64, so comparing them with the *string*
# "-100000" can never match; compare against the numeric sentinel instead.
data[data == -100000].count()
data[(data == -100000).any(axis = 1)].shape[0]
0
Numerical variables have no missing data.
Let's change the cover-up values to NA for the duration of the analysis.
data = data.replace('?',np.nan).replace("-100000", np.nan)
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 48842 entries, 0 to 48841 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 48842 non-null int64 1 workclass 46043 non-null object 2 fnlwgt 48842 non-null float64 3 education 48842 non-null object 4 education_num 48842 non-null int64 5 marital_status 48842 non-null object 6 occupation 46033 non-null object 7 relationship 48842 non-null object 8 race 48842 non-null object 9 sex 48842 non-null object 10 capital_gain 48842 non-null float64 11 capital_loss 48842 non-null float64 12 hours_per_week 48842 non-null float64 13 native_country 47985 non-null object 14 income_level 48842 non-null object dtypes: float64(4), int64(2), object(9) memory usage: 5.6+ MB
Now we can see that there are missing values in the workclass, occupation and native_country columns.
The part which consists of rows with missing data:
def f(row):
    """Return 1 if the row is missing workclass, occupation or
    native_country (the only columns with concealed NAs), else 0."""
    cols = ('workclass', 'occupation', 'native_country')
    return int(any(pd.isna(row[c]) for c in cols))
data['missing_value'] = data.apply(f, axis=1)
# Fraction of rows with at least one missing value: the mean of a 0/1 flag
# equals the proportion of ones and is much clearer than counting cells == 1.
data['missing_value'].mean()
0.07411653904426518
data = data.drop(columns=['missing_value'])
We consider that in a situation where rows with missing data represent about 7.41% of the set, we should not delete them. So we will replace them with the mode of the values in that column.
data['workclass'].value_counts()
Private 33906 Self-emp-not-inc 3862 Local-gov 3136 State-gov 1981 Self-emp-inc 1695 Federal-gov 1432 Without-pay 21 Never-worked 10 Name: workclass, dtype: int64
data['occupation'].value_counts()
Prof-specialty 6172 Craft-repair 6112 Exec-managerial 6086 Adm-clerical 5611 Sales 5504 Other-service 4923 Machine-op-inspct 3022 Transport-moving 2355 Handlers-cleaners 2072 Farming-fishing 1490 Tech-support 1446 Protective-serv 983 Priv-house-serv 242 Armed-Forces 15 Name: occupation, dtype: int64
data['native_country'].value_counts()
United-States 43832 Mexico 951 Philippines 295 Germany 206 Puerto-Rico 184 Canada 182 El-Salvador 155 India 151 Cuba 138 England 127 China 122 South 115 Jamaica 106 Italy 105 Dominican-Republic 103 Japan 92 Guatemala 88 Poland 87 Vietnam 86 Columbia 85 Haiti 75 Portugal 67 Taiwan 65 Iran 59 Greece 49 Nicaragua 49 Peru 46 Ecuador 45 France 38 Ireland 37 Thailand 30 Hong 30 Cambodia 28 Trinadad&Tobago 27 Outlying-US(Guam-USVI-etc) 23 Yugoslavia 23 Laos 23 Scotland 21 Honduras 20 Hungary 19 Holand-Netherlands 1 Name: native_country, dtype: int64
# Impute each categorical column with its mode (most frequent value).
# Assigning the result back avoids inplace fillna through attribute access,
# which triggers pandas chained-assignment warnings and is deprecated.
data['workclass'] = data['workclass'].fillna("Private")
data['occupation'] = data['occupation'].fillna("Prof-specialty")
data['native_country'] = data['native_country'].fillna("United-States")
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 48842 entries, 0 to 48841 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 48842 non-null int64 1 workclass 48842 non-null object 2 fnlwgt 48842 non-null float64 3 education 48842 non-null object 4 education_num 48842 non-null int64 5 marital_status 48842 non-null object 6 occupation 48842 non-null object 7 relationship 48842 non-null object 8 race 48842 non-null object 9 sex 48842 non-null object 10 capital_gain 48842 non-null float64 11 capital_loss 48842 non-null float64 12 hours_per_week 48842 non-null float64 13 native_country 48842 non-null object 14 income_level 48842 non-null object dtypes: float64(4), int64(2), object(9) memory usage: 5.6+ MB
data_train, data_valid = train_test_split(data,train_size=0.7, stratify=data["income_level"], random_state=55)
data.shape
(48842, 15)
data_train.shape
(34189, 15)
data_valid.shape
(14653, 15)
data_all = data
data = data_train
data.describe()
| age | fnlwgt | education_num | capital_gain | capital_loss | hours_per_week | |
|---|---|---|---|---|---|---|
| count | 34189.000000 | 3.418900e+04 | 34189.000000 | 34189.000000 | 34189.000000 | 34189.000000 |
| mean | 38.616690 | 1.897100e+05 | 10.076925 | 1135.683553 | 86.497294 | 40.427681 |
| std | 13.701497 | 1.057805e+05 | 2.573352 | 7767.079194 | 400.265350 | 12.357519 |
| min | 17.000000 | 1.228500e+04 | 1.000000 | 0.000000 | 0.000000 | 1.000000 |
| 25% | 28.000000 | 1.175250e+05 | 9.000000 | 0.000000 | 0.000000 | 40.000000 |
| 50% | 37.000000 | 1.781000e+05 | 10.000000 | 0.000000 | 0.000000 | 40.000000 |
| 75% | 48.000000 | 2.376200e+05 | 12.000000 | 0.000000 | 0.000000 | 45.000000 |
| max | 90.000000 | 1.484705e+06 | 16.000000 | 99999.000000 | 4356.000000 | 99.000000 |
We see that the variable "age" starts at age 17 and ends at age 90.
In addition, we see that at least 75% of the values of "capital_gain" and "capital_loss" are equal to 0.
# Discrete list palette for categorical plots, continuous colormap for heatmaps.
discre = sns.color_palette("bwr", as_cmap=False)
contin = sns.color_palette("bwr", as_cmap=True)
# Distribution of the target classes.
sns.histplot(data['income_level'], color = discre[0])
plt.show()
People earning at most 50,000 USD clearly predominate in the dataset.
# Histogram (left column) and boxplot (right column) for each numeric variable.
numeric_cols = ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']
fig, axs = plt.subplots(6, 2, figsize=(20, 30))
for row_idx, col in enumerate(numeric_cols):
    sns.histplot(data=data, bins=20, x=col, ax=axs[row_idx, 0], color=discre[1])
    sns.boxplot(data=data, x=col, ax=axs[row_idx, 1], color=discre[3])
plt.show()
# Density of each numeric variable, split by income level.
fig, axs = plt.subplots(6, figsize=(20, 30))
for row_idx, col in enumerate(['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']):
    sns.kdeplot(data=data, x=col, shade=True, hue=data['income_level'], ax=axs[row_idx], palette=[discre[4], discre[0]])
plt.show()
We see that "age" starts at age 17. Seeing the drop and sudden jump between 80 and 90 we can conclude that everyone over 90 was given age = 90.
We see that in the "education_number" variable, the largest number is 9 - HS-grad, those who have completed high school. A small percentage is made up of people in the groups who have not completed high school.
We see that both in capital_loss and capital_gain are mostly zeros with possible outliers.
In the variable "hours_per_week" there is a large peak among people working around 40h per week — because that is full-time work.
From the graphs above, we concluded that there are no outliers for these variables because when we analyse the data as histograms/boxplots, we see that the values are not anomalies. The outliers fit into these distributions.
In order to be able to check the correlation of our most important variable with other variables, we map it to number.
# Encode the target as 0/1 so it can enter the correlation matrix.
binary = lambda x: 1 if x == ">50K" else 0
data['income_level'] = data['income_level'].apply(binary)
plt.figure(figsize=(14,8))
sns.heatmap(data.corr(), annot=True, vmin=-1, cmap = contin)
plt.show()
We see that the variable most strongly correlated with income_level is education_num. At a similar level we see correlations with age, capital_gain, and hours_per_week. This suggests we will not include the variable fnlwgt when building the model.
plt.figure(figsize=(14,8))
sns.heatmap(data.corr(method ='spearman'), annot=True, vmin=-1, cmap = contin)
plt.show()
The correlation between the variables breaks down very similarly to the Pearson correlation.
categorical_columns = list(data.select_dtypes(exclude=[np.number]).columns)
categorical_columns = [c for c in categorical_columns if c != 'income_level']
r = 2
# Count of each category, split by income level.
fig, axs = plt.subplots(8, figsize=(20, 70))
for idx, col in enumerate(categorical_columns):
    sns.countplot(data=data, hue=data['income_level'], y=col, ax=axs[idx], palette=[discre[4], discre[5]])
plt.show()
# Stacked, normalised bar chart of income level within each category.
# The original `if i == 0:` branch was byte-identical to its `else` branch,
# so the condition was dead code and has been removed.
fig, axs = plt.subplots(8, figsize=(20, 60))
for idx, col in enumerate(categorical_columns):
    (data.groupby(col)["income_level"]
         .value_counts(normalize=True)
         .unstack('income_level')
         .plot(kind="bar", stacked=True, ax=axs[idx], color=["#FF6863", "pink"]))
plt.show()
We see that with higher education the percentage of people earning above 50,000 increases.
The level of earnings of people who have finished school before high school are at a very similar level.
By far the highest earning race is the white race.
About 40% of men earn above 50,000 while among women the percentage is about 10%.
We can see that, in terms of native country, the overwhelming majority of the dataset are Americans.
We do all the steps analogous to the part above.
data = data_valid
data.describe()
| age | fnlwgt | education_num | capital_gain | capital_loss | hours_per_week | |
|---|---|---|---|---|---|---|
| count | 14653.00000 | 1.465300e+04 | 14653.000000 | 14653.000000 | 14653.000000 | 14653.000000 |
| mean | 38.70634 | 1.895570e+05 | 10.080803 | 946.968948 | 89.847267 | 40.410018 |
| std | 13.73178 | 1.051947e+05 | 2.565499 | 6657.572739 | 409.328688 | 12.470655 |
| min | 17.00000 | 1.349200e+04 | 1.000000 | 0.000000 | 0.000000 | 1.000000 |
| 25% | 28.00000 | 1.176060e+05 | 9.000000 | 0.000000 | 0.000000 | 40.000000 |
| 50% | 37.00000 | 1.784160e+05 | 10.000000 | 0.000000 | 0.000000 | 40.000000 |
| 75% | 48.00000 | 2.377130e+05 | 12.000000 | 0.000000 | 0.000000 | 45.000000 |
| max | 90.00000 | 1.490400e+06 | 16.000000 | 99999.000000 | 4356.000000 | 99.000000 |
sns.histplot(data['income_level'], color = discre[0])
plt.show()
# Same histogram/boxplot grid as for the training part, now on the validation part.
numeric_cols = ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']
fig, axs = plt.subplots(6, 2, figsize=(20, 30))
for row_idx, col in enumerate(numeric_cols):
    sns.histplot(data=data, bins=20, x=col, ax=axs[row_idx, 0], color=discre[1])
    sns.boxplot(data=data, x=col, ax=axs[row_idx, 1], color=discre[3])
plt.show()
# Density of each numeric variable, split by income level (validation part).
fig, axs = plt.subplots(6, figsize=(20, 30))
for row_idx, col in enumerate(['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']):
    sns.kdeplot(data=data, x=col, shade=True, hue=data['income_level'], ax=axs[row_idx], palette=[discre[4], discre[0]])
plt.show()
All our previous observations hold true here as well.
# Encode the validation target as 0/1, mirroring the training part.
binary = lambda x: 1 if x == ">50K" else 0
data['income_level'] = data['income_level'].apply(binary)
plt.figure(figsize=(14,8))
sns.heatmap(data.corr(), annot=True, vmin=-1, cmap = contin)
plt.show()
We see that here also the highest correlation with income_level is with education_num. We also see a similar correlation for age, capital_gain and hours_per_week. The assumption of a negligible effect of fnlwgt also holds true.
categorical_columns = list(data.select_dtypes(exclude=[np.number]).columns)
categorical_columns = [c for c in categorical_columns if c != 'income_level']
r = 2
# Count of each category, split by income level (validation part).
fig, axs = plt.subplots(8, figsize=(20, 70))
for idx, col in enumerate(categorical_columns):
    sns.countplot(data=data, hue=data['income_level'], y=col, ax=axs[idx], palette=[discre[4], discre[5]])
plt.show()
# Stacked, normalised bar chart per category (validation part). The original
# `if i == 0:` branch duplicated the `else` branch exactly, so the condition
# was dead code and has been removed.
fig, axs = plt.subplots(8, figsize=(20, 60))
for idx, col in enumerate(categorical_columns):
    (data.groupby(col)["income_level"]
         .value_counts(normalize=True)
         .unstack('income_level')
         .plot(kind="bar", stacked=True, ax=axs[idx], color=["#FF6863", "pink"]))
plt.show()
Here, too, all our previous observations hold true.
We believe that the following variables are worth ignoring in later work:
fnlwgt - a statistical variable which does not introduce new relationships into the model; education_num - a variable carrying the same information as the education column; relationship - a variable whose information is already covered by marital_status and sex; sex and race - variables that may exacerbate earnings differences through our model and whose use is generally regarded as unethical.
data = data_all
data.shape
(48842, 15)
# Drop all discarded variables in a single call instead of five separate ones.
data.drop(columns=['fnlwgt', 'education_num', 'relationship', 'sex', 'race'], inplace=True)
We decided to do the changes on a new frame so that there would be an opportunity to go back if necessary.
df = data
df.head()
| age | workclass | education | marital_status | occupation | capital_gain | capital_loss | hours_per_week | native_country | income_level | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 39 | State-gov | Bachelors | Never-married | Adm-clerical | 2174.0 | 0.0 | 40.0 | United-States | <=50K |
| 1 | 50 | Self-emp-not-inc | Bachelors | Married-civ-spouse | Exec-managerial | 0.0 | 0.0 | 13.0 | United-States | <=50K |
| 2 | 38 | Private | HS-grad | Divorced | Handlers-cleaners | 0.0 | 0.0 | 40.0 | United-States | <=50K |
| 3 | 53 | Private | 11th | Married-civ-spouse | Handlers-cleaners | 0.0 | 0.0 | 40.0 | United-States | <=50K |
| 4 | 28 | Private | Bachelors | Married-civ-spouse | Prof-specialty | 0.0 | 0.0 | 40.0 | Cuba | <=50K |
Let's start with the education variable:
# Collapse the fine-grained education levels into three coarse groups.
# A single dict-based replace is equivalent to the twelve .loc assignments.
education_groups = {
    "Preschool": "Dropout", "1st-4th": "Dropout", "5th-6th": "Dropout",
    "7th-8th": "Dropout", "9th": "Dropout", "10th": "Dropout",
    "11th": "Dropout", "12th": "Dropout",
    "Assoc-acdm": "Associates", "Assoc-voc": "Associates",
    "HS-grad": "HS/college", "Some-college": "HS/college",
}
df['education'] = df['education'].replace(education_groups)
We then considered it useful to restrict the set of values of the `marital-status' variable:
# All married sub-statuses collapse into a single "Married" category.
df['marital_status'] = df['marital_status'].replace({
    "Married-civ-spouse": "Married",
    "Married-spouse-absent": "Married",
    "Married-AF-spouse": "Married",
})
And to restrict the workclass variable:
# Merge workclass into self-employed / government / no-income groups.
df['workclass'] = df['workclass'].replace({
    "Self-emp-not-inc": "Self-emp", "Self-emp-inc": "Self-emp",
    "Federal-gov": "Gov", "Local-gov": "Gov", "State-gov": "Gov",
    "Without-pay": "No-gain", "Never-worked": "No-gain",
})
We group the variable native_country, due to the large number of countries we decided to divide them into continents:
# Map each native country to a continent-level group:
# NA = North America, LA = Latin America, EU = Europe, AS = Asia.
replace_countries = {'United-States': 'NA',
                     'China': 'AS',
                     'Mexico': 'LA',
                     'Guatemala': 'LA',
                     'Cuba': 'LA',
                     'Jamaica': 'LA',
                     'South': 'AS',
                     'Puerto-Rico': 'LA',
                     'Honduras': 'LA',
                     'England': 'EU',
                     'Canada': 'NA',
                     'Germany': 'EU',
                     'Iran': 'AS',
                     'Philippines': 'AS',
                     'Poland': 'EU',
                     'Columbia': 'LA',
                     'Cambodia': 'AS',
                     'Thailand': 'AS',
                     'Ecuador': 'LA',
                     'Laos': 'AS',
                     'Taiwan': 'AS',
                     'Haiti': 'LA',
                     'Portugal': 'EU',
                     'Dominican-Republic': 'LA',
                     'El-Salvador': 'LA',
                     'France': 'EU',
                     'Italy': 'EU',
                     'India': 'AS',
                     'Japan': 'AS',
                     'Yugoslavia': 'EU',
                     'Peru': 'LA',
                     # 'Hong' is the dataset's label for Hong Kong, so it
                     # belongs to Asia (was incorrectly mapped to 'LA').
                     'Hong': 'AS',
                     'Ireland': 'EU',
                     'Trinadad&Tobago': 'LA',
                     'Greece': 'EU',
                     'Nicaragua': 'LA',
                     'Vietnam': 'AS',
                     'Outlying-US(Guam-USVI-etc)': 'NA',
                     'Scotland': 'EU',
                     'Hungary': 'EU',
                     'Holand-Netherlands': 'EU'}
df["native_country"] = df["native_country"].replace(replace_countries)
We also group the hours_per_week column by a value of 40 (full-time hours)
def zmienprace(row):
    """Bucket hours_per_week relative to full-time work (40 h per week)."""
    hours = row['hours_per_week']
    if hours < 40:
        return "less-40hours"
    if hours > 40:
        return "more-40hours"
    return "40hours"
df.hours_per_week = df.apply(zmienprace, axis = 1)
df.hours_per_week.value_counts()
40hours 22803 more-40hours 14352 less-40hours 11687 Name: hours_per_week, dtype: int64
Next is age:
def zmienwiek(row):
    """Bucket age into three groups: <26 young, 26-40 middle, 41+ elderly."""
    age = row['age']
    if age < 26:
        return "Young Adults"
    if age < 41:
        return "Middle Adults"
    return "Elderly"
df.age = df.apply(zmienwiek, axis = 1)
df.age.value_counts()
Elderly 20211 Middle Adults 19004 Young Adults 9627 Name: age, dtype: int64
We noticed that some categorical variables still had a large number of categories, so we decided to reduce them using WOE
def calculate_woe_iv(dataset, feature, target):
    """Compute the Weight of Evidence (WoE) per category of *feature*.

    'Good' counts rows where target == 0, 'Bad' rows where target == 1.
    Categories whose WoE would be +/-inf (zero share on one side) are
    clamped to 0. Returns a DataFrame sorted by WoE ascending.
    """
    # Iterate the unique values directly: the original rebuilt
    # list(dataset[feature].unique()) on every loop iteration, which
    # was O(n_categories^2) in list constructions.
    lst = []
    for val in dataset[feature].unique():
        mask = dataset[feature] == val
        lst.append({
            'Value': val,
            'All': int(mask.sum()),
            'Good': int((mask & (dataset[target] == 0)).sum()),
            'Bad': int((mask & (dataset[target] == 1)).sum()),
        })
    dset = pd.DataFrame(lst)
    dset['Distr_Good'] = dset['Good'] / dset['Good'].sum()
    dset['Distr_Bad'] = dset['Bad'] / dset['Bad'].sum()
    dset['WoE'] = np.log(dset['Distr_Good'] / dset['Distr_Bad'])
    dset = dset.replace({'WoE': {np.inf: 0, -np.inf: 0}})
    return dset.sort_values(by='WoE')
df["income_level"]=df["income_level"].map({"<=50K":0,">50K":1})
occupation
# WoE table for occupation on the whole frame. Note: the original
# 'WoE'.format(col) was a no-op (the string has no placeholder).
print('WoE')
df2 = calculate_woe_iv(df, 'occupation', 'income_level')
print(df2)
WoE
Value All Good Bad Distr_Good Distr_Bad WoE
1 Exec-managerial 6086 3178 2908 0.085534 0.248823 -1.067835
3 Prof-specialty 8981 5932 3049 0.159655 0.260888 -0.491073
12 Armed-Forces 15 10 5 0.000269 0.000428 -0.463474
11 Protective-serv 983 675 308 0.018167 0.026354 -0.372008
10 Tech-support 1446 1026 420 0.027614 0.035937 -0.263453
5 Sales 5504 4029 1475 0.108438 0.126209 -0.151761
6 Craft-repair 6112 4729 1383 0.127278 0.118337 0.072837
7 Transport-moving 2355 1874 481 0.050437 0.041157 0.203342
0 Adm-clerical 5611 4843 768 0.130346 0.065714 0.684879
9 Machine-op-inspct 3022 2650 372 0.071323 0.031830 0.806800
8 Farming-fishing 1490 1317 173 0.035446 0.014803 0.873199
2 Handlers-cleaners 2072 1934 138 0.052052 0.011808 1.483471
4 Other-service 4923 4719 204 0.127008 0.017455 1.984611
13 Priv-house-serv 242 239 3 0.006433 0.000257 3.221230
Based on the results, we could combine categories that have no logical reason why they could be taken as one. Such as 'Machine-op-inspct' and 'Farming-fishing'. We can only merge them into one if these results are repeated on the validation and training sets.
data_train, data_val = train_test_split(df,train_size=0.7, stratify=data["income_level"], random_state=55)
# WoE tables for occupation on the train and validation splits, to check
# whether the category ordering is stable. ('WoE'.format(col) was a no-op.)
print('WoE')
df2 = calculate_woe_iv(data_train, 'occupation', 'income_level')
print(df2)
print('WoE')
df2 = calculate_woe_iv(data_val, 'occupation', 'income_level')
print(df2)
WoE
Value All Good Bad Distr_Good Distr_Bad WoE
1 Exec-managerial 4247 2212 2035 0.085051 0.248747 -1.073189
13 Armed-Forces 13 8 5 0.000308 0.000611 -0.686586
8 Prof-specialty 6294 4172 2122 0.160412 0.259381 -0.480553
12 Protective-serv 690 479 211 0.018417 0.025791 -0.336747
11 Tech-support 1020 728 292 0.027991 0.035692 -0.243043
9 Sales 3894 2820 1074 0.108428 0.131280 -0.191243
3 Craft-repair 4256 3280 976 0.126115 0.119301 0.055546
10 Transport-moving 1604 1271 333 0.048870 0.040704 0.182827
4 Adm-clerical 3936 3403 533 0.130844 0.065151 0.697301
5 Machine-op-inspct 2112 1867 245 0.071786 0.029947 0.874240
7 Farming-fishing 1039 920 119 0.035374 0.014546 0.888660
0 Handlers-cleaners 1470 1370 100 0.052676 0.012223 1.460806
2 Other-service 3444 3311 133 0.127307 0.016257 2.058067
6 Priv-house-serv 170 167 3 0.006421 0.000367 2.862792
WoE
Value All Good Bad Distr_Good Distr_Bad WoE
5 Exec-managerial 1839 966 873 0.086660 0.249002 -1.055466
1 Prof-specialty 2687 1760 927 0.157890 0.264404 -0.515579
10 Protective-serv 293 196 97 0.017583 0.027667 -0.453291
11 Tech-support 426 298 128 0.026734 0.036509 -0.311631
2 Sales 1610 1209 401 0.108460 0.114375 -0.053107
12 Priv-house-serv 72 72 0 0.006459 0.000000 0.000000
13 Armed-Forces 2 2 0 0.000179 0.000000 0.000000
3 Craft-repair 1856 1449 407 0.129990 0.116087 0.113121
6 Transport-moving 751 603 148 0.054095 0.042213 0.248010
0 Adm-clerical 1675 1440 235 0.129183 0.067028 0.656118
7 Machine-op-inspct 910 783 127 0.070243 0.036224 0.662251
9 Farming-fishing 451 397 54 0.035615 0.015402 0.838258
8 Handlers-cleaners 602 564 38 0.050597 0.010839 1.540773
4 Other-service 1479 1408 71 0.126312 0.020251 1.830551
We combine categories with similar WoE:
# Merge occupation categories that showed similar WoE on both splits.
# One dict-based replace is equivalent to the twelve .loc assignments.
occupation_groups = {
    "Adm-clerical": "Adm-clerical/Machine-op-inspct/Farming-fishing",
    "Machine-op-inspct": "Adm-clerical/Machine-op-inspct/Farming-fishing",
    "Farming-fishing": "Adm-clerical/Machine-op-inspct/Farming-fishing",
    "Prof-specialty": "Prof-specialty/Protective-serv/Armed-Forces",
    "Protective-serv": "Prof-specialty/Protective-serv/Armed-Forces",
    "Armed-Forces": "Prof-specialty/Protective-serv/Armed-Forces",
    "Tech-support": "Tech-support/Sales",
    "Sales": "Tech-support/Sales",
    "Craft-repair": "Craft-repair/Transport-moving",
    "Transport-moving": "Craft-repair/Transport-moving",
    "Handlers-cleaners": "Handlers-cleaners/Other-service",
    "Other-service": "Handlers-cleaners/Other-service",
}
df['occupation'] = df['occupation'].replace(occupation_groups)
We now one-hot encode the categorical columns into binary indicator variables:
df = pd.get_dummies(df, columns=["age", "workclass", "education", "marital_status", "occupation", "hours_per_week", "native_country"])
The variables capital_gain and capital_loss are being normalised
from sklearn.preprocessing import MinMaxScaler
# MinMaxScaler scales each column independently (per-feature min/max), so a
# single fit_transform over both capital columns is equivalent to fitting a
# fresh scaler per column, and avoids the redundant second fit.
scaler = MinMaxScaler()
df[["capital_gain", "capital_loss"]] = scaler.fit_transform(df[["capital_gain", "capital_loss"]])
Let's see how our frame looks now:
df.head(10)
| capital_gain | capital_loss | income_level | age_Elderly | age_Middle Adults | age_Young Adults | workclass_Gov | workclass_No-gain | workclass_Private | workclass_Self-emp | ... | occupation_Priv-house-serv | occupation_Prof-specialty/Protective-serv/Armed-Forces | occupation_Tech-support/Sales | hours_per_week_40hours | hours_per_week_less-40hours | hours_per_week_more-40hours | native_country_AS | native_country_EU | native_country_LA | native_country_NA | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.021740 | 0.0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
| 1 | 0.000000 | 0.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
| 2 | 0.000000 | 0.0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
| 3 | 0.000000 | 0.0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
| 4 | 0.000000 | 0.0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 5 | 0.000000 | 0.0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
| 6 | 0.000000 | 0.0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
| 7 | 0.000000 | 0.0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
| 8 | 0.140841 | 0.0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
| 9 | 0.051781 | 0.0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
10 rows × 36 columns
df.to_csv("./data/census_income_dataset-encoded.csv", index=False)
We believe that we should dispense with variables:
fnlwgt - a statistical variable, not helpful for the model; education_num - we will use the corresponding variable education instead; relationship - carries information that overlaps with the variables marital_status and sex.
In order to prepare the modelling frame we decided to replace the variables age and hours_per_week with categorical variables.We noticed that it was useful to reduce the number of categories in some of the variables and to group them, so we did this where possible. We then encoded the columns using HotEncoding.
Due to the large discrepancy in the values of the capital_gain and capital_loss variables, we concluded that it was best to apply MinMaxScaler to them.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import plot_roc_curve
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import plot_confusion_matrix
import xgboost
import warnings
warnings.filterwarnings('ignore')
data = pd.read_csv("./data/census_income_dataset-encoded.csv")
data, data_val = train_test_split(data, test_size = 0.3)
#data = data.sample(frac =.2) # work on a sample, because the full set renders very slowly
# Separate features from the target, then split 70/30 for train/test.
X = data.drop(['income_level'], axis = 1)
Y = np.array(data['income_level'])
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = 123)
def featureimportance(model, importances, sth):
    """Bar-plot feature importances, largest first.

    `model` is unused but kept for interface compatibility with callers;
    `sth == "coeff"` selects the coefficient-based title.
    """
    ranked = importances.sort_values(by='Importance', ascending=False)
    plt.figure(figsize=(14, 7))
    plt.bar(x=ranked['Attribute'], height=ranked['Importance'], color='#087E8B')
    title = ('Feature importances obtained from coefficients'
             if sth == "coeff"
             else 'Feature importances obtained from feature_importances')
    plt.title(title, size=20)
    plt.xticks(rotation='vertical')
    return plt.show()
To choose the right model we will check the accuracy of nine of them.
def evaluate_model(model, X, y):
    """Return the 20-fold cross-validated accuracy scores of `model` on (X, y)."""
    return cross_val_score(
        model, X, y,
        scoring='accuracy', cv=20, n_jobs=-1, error_score='raise',
    )
# Accumulators for the model-comparison section.
models = dict()   # model name -> mean CV accuracy (drives the comparison tables)
models2 = dict()  # NOTE(review): never used in this section — candidate for removal
names = list()    # short labels for the boxplot x-axis
scores = list()   # per-model arrays of CV fold scores (boxplot input)
# Logistic regression baseline: cross-validated accuracy on the full working set,
# plus a single train/test score used for the confusion matrix below.
modelLR = LogisticRegression(random_state = 1)
scoreLR = evaluate_model(modelLR, X, Y)
print(f"Model accuracy (cross-validation): {np.mean(scoreLR)}")
modelLR.fit(X_train, Y_train)
print(f"Model score: {modelLR.score(X_test, Y_test)}")
models["Logistic Regression"] = np.mean(scoreLR)
names.append("LogisticReg")
scores.append(scoreLR)
Model accuracy (cross-validation): 0.8475248170162095 Model score: 0.8403041825095057
# Confusion matrix on the held-out test split.
plot_confusion_matrix(modelLR, X_test, Y_test)
plt.show()
# For a linear model, the signed coefficients act as feature importances.
importances = pd.DataFrame(data={
    'Attribute': X_train.columns,
    'Importance': modelLR.coef_[0]
})
featureimportance(modelLR, importances, "coeff")
# Decision tree with default hyperparameters.
# NOTE(review): no random_state, so tie-breaking may vary between runs.
modelDT = DecisionTreeClassifier()
scoreDT = evaluate_model(modelDT, X, Y)
print(f"Model accuracy (cross-validation): {np.mean(scoreDT)}")
modelDT.fit(X_train, Y_train)
print(f"Model score: {modelDT.score(X_test, Y_test)}")
models["Decision Tree"] = np.mean(scoreDT)
names.append("DTree")
scores.append(scoreDT)
Model accuracy (cross-validation): 0.8541638864080427 Model score: 0.8469338013064249
# Confusion matrix and impurity-based feature importances for the tree.
plot_confusion_matrix(modelDT, X_test, Y_test)
plt.show()
importances = pd.DataFrame(data={
    'Attribute': X_train.columns,
    'Importance': modelDT.feature_importances_
})
featureimportance(modelDT, importances, "")
# k-nearest neighbors with the default number of neighbors.
modelKN = KNeighborsClassifier()
scoreKN = evaluate_model(modelKN, X, Y)
print(f"Model accuracy (cross-validation): {np.mean(scoreKN)}")
modelKN.fit(X_train, Y_train)
print(f"Model score: {modelKN.score(X_test, Y_test)}")
models['KNeighbors'] = np.mean(scoreKN)
names.append("KNeighbors")
scores.append(scoreKN)
Model accuracy (cross-validation): 0.8265539164861637 Model score: 0.8214877644535439
# Confusion matrix for KNN (no feature importances available for this model).
plot_confusion_matrix(modelKN, X_test, Y_test)
plt.show()
# Random forest with default hyperparameters.
modelRF = RandomForestClassifier()
scoreRF = evaluate_model(modelRF, X, Y)
print(f"Model accuracy (cross-validation): {np.mean(scoreRF)}")
modelRF.fit(X_train, Y_train)
print(f"Model score: {modelRF.score(X_test, Y_test)}")
models['Random Forest'] = np.mean(scoreRF)
names.append("RForest")
scores.append(scoreRF)
Model accuracy (cross-validation): 0.8524675180246305 Model score: 0.8484937116115824
# Confusion matrix and impurity-based feature importances for the forest.
plot_confusion_matrix(modelRF, X_test, Y_test)
plt.show()
importances = pd.DataFrame(data={
    'Attribute': X_train.columns,
    'Importance': modelRF.feature_importances_
})
featureimportance(modelRF, importances, "")
# Gaussian naive Bayes baseline.
modelB = GaussianNB()
scoreB = evaluate_model(modelB, X, Y)
print(f"Model accuracy (cross-validation): {np.mean(scoreB)}")
modelB.fit(X_train, Y_train)
print(f"Model score: {modelB.score(X_test, Y_test)}")
models['Bayes'] = np.mean(scoreB)
names.append("Bayes")
scores.append(scoreB)
Model accuracy (cross-validation): 0.7907229698979259 Model score: 0.788632153651165
# Confusion matrix for naive Bayes.
plot_confusion_matrix(modelB, X_test, Y_test)
plt.show()
# Support vector classifier with default (RBF) kernel.
modelSVC = SVC()
scoreSVC = evaluate_model(modelSVC, X, Y)
print(f"Model accuracy (cross-validation): {np.mean(scoreSVC)}")
modelSVC.fit(X_train, Y_train)
print(f"Model score: {modelSVC.score(X_test, Y_test)}")
models['SVC'] = np.mean(scoreSVC)
names.append("SVC")
scores.append(scoreSVC)
Model accuracy (cross-validation): 0.8346844192595786 Model score: 0.8299697767378376
# Confusion matrix for the SVC.
plot_confusion_matrix(modelSVC, X_test, Y_test)
plt.show()
# XGBoost; eval_metric set explicitly to silence the default-metric warning.
modelXGB = XGBClassifier(eval_metric='logloss')
scoreXGB = evaluate_model(modelXGB, X, Y)
print(f"Model accuracy (cross-validation): {np.mean(scoreXGB)}")
modelXGB.fit(X_train, Y_train)
print(f"Model score: {modelXGB.score(X_test, Y_test)}")
# BUG FIX: this previously stored np.mean(scoreSVC), so the comparison table
# reported the SVC accuracy (0.834684) for XGBoost instead of its real ~0.8667
# (visible in the rendered tables, where SVC and XGBoost show identical values).
models['XGBoost'] = np.mean(scoreXGB)
names.append("XGB")
scores.append(scoreXGB)
Model accuracy (cross-validation): 0.8667119891595577 Model score: 0.8596080725358292
# Confusion matrix and gain-based feature importances for XGBoost.
plot_confusion_matrix(modelXGB, X_test, Y_test)
plt.show()
importances = pd.DataFrame(data={
    'Attribute': X_train.columns,
    'Importance': modelXGB.feature_importances_
})
featureimportance(modelXGB, importances, "")
# LightGBM with default hyperparameters.
modelLGBM = LGBMClassifier()
scoreLGBM = evaluate_model(modelLGBM, X, Y)
print(f"Model accuracy (cross-validation): {np.mean(scoreLGBM)}")
modelLGBM.fit(X_train, Y_train)
print(f"Model score: {modelLGBM.score(X_test, Y_test)}")
models['LightGBM'] = np.mean(scoreLGBM)
names.append("LGBM")
scores.append(scoreLGBM)
Model accuracy (cross-validation): 0.8666827322841921 Model score: 0.8598030613239739
# Confusion matrix and split-based feature importances for LightGBM.
plot_confusion_matrix(modelLGBM, X_test, Y_test)
plt.show()
importances = pd.DataFrame(data={
    'Attribute': X_train.columns,
    'Importance': modelLGBM.feature_importances_
})
featureimportance(modelLGBM, importances, "")
# AdaBoost with a fixed seed for reproducibility.
modelAB = AdaBoostClassifier(random_state = 1)
scoreAB = evaluate_model(modelAB, X, Y)
print(f"Model accuracy (cross-validation): {np.mean(scoreAB)}")
modelAB.fit(X_train, Y_train)
print(f"Model score: {modelAB.score(X_test, Y_test)}")
models['AdaBoost'] = np.mean(scoreAB)
names.append("AdaB")
scores.append(scoreAB)
Model accuracy (cross-validation): 0.856913450976769 Model score: 0.8524909817685483
# Confusion matrix and feature importances for AdaBoost.
plot_confusion_matrix(modelAB, X_test, Y_test)
plt.show()
importances = pd.DataFrame(data={
    'Attribute': X_train.columns,
    'Importance': modelAB.feature_importances_
})
featureimportance(modelAB, importances, "")
# Rank the nine single models by mean cross-validated accuracy.
# ("comparision" is a typo, but the name is reused later in the notebook.)
comparision = pd.DataFrame(data = {"models" : models.keys(), "accuracy" : models.values()})
comparision.sort_values(by = "accuracy", ascending = False).reset_index(drop = True)
| models | accuracy | |
|---|---|---|
| 0 | LightGBM | 0.866683 |
| 1 | AdaBoost | 0.856913 |
| 2 | Decision Tree | 0.854164 |
| 3 | Random Forest | 0.852468 |
| 4 | Logistic Regression | 0.847525 |
| 5 | SVC | 0.834684 |
| 6 | XGBoost | 0.834684 |
| 7 | KNeighbors | 0.826554 |
| 8 | Bayes | 0.790723 |
# Boxplot of the per-fold CV accuracy distributions of all single models.
plt.figure(figsize=(13,5))
plt.boxplot(scores, labels=names)
plt.show()
# Overlay the ROC curves of all nine classifiers on a single axes.
model = [modelLR, modelDT, modelKN, modelRF, modelB, modelSVC, modelXGB, modelLGBM, modelAB]
for clf in model:
    clf.fit(X_train, Y_train)  # refit each model on the train split
plt.figure(figsize=(14,8))
# The first call creates the axes; every later curve is drawn onto the same axes.
fig = plot_roc_curve(model[0], X_test, Y_test)
for clf in model[1:]:
    fig = plot_roc_curve(clf, X_test, Y_test, ax = fig.ax_)
fig = plt.title("ROC curve comparison")
plt.show()
<Figure size 1008x576 with 0 Axes>
We will now select the 4 models with the highest accuracy and combine them into ensemble models.
# Ensemble 1: hard voting (majority vote on predicted labels) over the four
# best single models.
estimators1 = [("LightGBM", modelLGBM), ("AdaBoost", modelAB), ("Logistic Regression", modelLR), ("Random Forest", modelRF)]
# Variant without logistic regression, used by the stacking classifier below.
estimators2 = [("LightGBM", modelLGBM), ("AdaBoost", modelAB), ("Random Forest", modelRF)]
modelHard = VotingClassifier(estimators=estimators1, voting='hard')
modelHard.fit(X_train,Y_train)
scoreHard = np.mean(evaluate_model(modelHard, X, Y))  # already a scalar mean
print('Model score: ', modelHard.score(X_test,Y_test))
print('Model accuracy (cross-validation): ', scoreHard)
models['VotingHard'] = np.mean(scoreHard)  # np.mean of a scalar is a no-op
names.append("VHard")
# NOTE(review): unlike earlier entries, this appends a scalar (not a fold-score
# array) to `scores`; re-running the boxplot cell afterwards would be misleading.
scores.append(scoreHard)
Model score: 0.8557082967729356 Model accuracy (cross-validation): 0.8611546884570505
# Confusion matrix for the hard-voting ensemble.
plot_confusion_matrix(modelHard, X_test, Y_test)
plt.show()
# Ensemble 2: soft voting (averaged class probabilities) over the same estimators.
modelSoft = VotingClassifier(estimators=estimators1, voting='soft')
modelSoft.fit(X_train,Y_train)
scoreSoft = np.mean(evaluate_model(modelSoft, X, Y))  # scalar mean CV accuracy
print('Model accuracy (cross-validation): ', scoreSoft)
print('Model score: ', modelSoft.score(X_test,Y_test))
models['VotingSoft'] = np.mean(scoreSoft)  # np.mean of a scalar is a no-op
names.append("VSoft")
scores.append(scoreSoft)  # scalar, like VotingHard above
Model accuracy (cross-validation): 0.8628509712940435 Model score: 0.8579506678365993
# Confusion matrix for the soft-voting ensemble.
plot_confusion_matrix(modelSoft, X_test, Y_test)
plt.show()
# Ensemble 3: stacking — base models' predictions fed into a logistic-regression
# meta-estimator.
modelStack = StackingClassifier(estimators=estimators2, final_estimator=LogisticRegression())
scoreStack = np.mean(evaluate_model(modelStack, X, Y))  # scalar mean CV accuracy
print('Model accuracy (cross-validation): ', scoreStack)
print('Model score: ', modelStack.fit(X_train, Y_train).score(X_test, Y_test))
models['Stacking'] = np.mean(scoreStack)  # np.mean of a scalar is a no-op
names.append("Stacking")
scores.append(scoreStack)  # scalar, like the voting ensembles above
Model accuracy (cross-validation): 0.8662440502465449 Model score: 0.8587306229891781
# Confusion matrix for the stacking ensemble.
plot_confusion_matrix(modelStack, X_test, Y_test)
plt.show()
# Re-rank all models — single and ensemble — by mean cross-validated accuracy.
comparision = pd.DataFrame(data = {"models" : models.keys(), "accuracy" : models.values()})
comparision.sort_values(by = "accuracy", ascending = False).reset_index(drop = True)
| models | accuracy | |
|---|---|---|
| 0 | LightGBM | 0.866683 |
| 1 | Stacking | 0.866244 |
| 2 | VotingSoft | 0.862851 |
| 3 | VotingHard | 0.861155 |
| 4 | AdaBoost | 0.856913 |
| 5 | Decision Tree | 0.854164 |
| 6 | Random Forest | 0.852468 |
| 7 | Logistic Regression | 0.847525 |
| 8 | SVC | 0.834684 |
| 9 | XGBoost | 0.834684 |
| 10 | KNeighbors | 0.826554 |
| 11 | Bayes | 0.790723 |
After a preliminary analysis of the models and their accuracy, we can see which of them performed best with the default parameters.
Considering the execution time, memory resources, and operational complexity of each model, in what follows we will focus only on the LightGBM model.
def gridSearch(model, grid, X, y):
    """Exhaustive hyperparameter search with repeated stratified 10-fold CV.

    Prints the best accuracy and hyperparameter set.

    FIX: the fitted GridSearchCV result was previously discarded (bare
    `return`); it is now returned so callers can reuse `best_estimator_`.
    Existing callers that ignore the return value are unaffected.
    """
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy', error_score=0)
    result = grid_search.fit(X, y)
    print('Best score (gridSearch): %s' % result.best_score_)
    print('Best hyperparameters: %s' % result.best_params_)
    return result
def randomSearch(model, grid, X, y):
    """Randomized hyperparameter search (50 samples) with repeated stratified 10-fold CV.

    Prints the best accuracy and hyperparameter set.

    FIX: the fitted RandomizedSearchCV result was previously discarded (bare
    `return`); it is now returned so callers can reuse `best_estimator_`.
    Existing callers that ignore the return value are unaffected.
    """
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    random_search = RandomizedSearchCV(estimator=model, param_distributions = grid, n_iter = 50, n_jobs=-1, cv=cv, scoring='accuracy', random_state=123)
    result = random_search.fit(X, y)
    print('Best score (randomSearch): %s' % result.best_score_)
    print('Best hyperparameters: %s' % result.best_params_)
    return result
# Hyperparameter search space for LightGBM tuning.
learning_rate = np.arange(0.1, 0.9, 0.1)  # 0.1 .. 0.8 in steps of 0.1
max_depth = range(1, 20, 5)               # 1, 6, 11, 16
num_leaves = range(5, 100,5)              # 5, 10, ..., 95
grid = dict(learning_rate = learning_rate, max_depth = max_depth, num_leaves = num_leaves)
gridSearch(modelLGBM, grid, X_train, Y_train)
Best score (gridSearch): 0.8692688737677411
Best hyperparameters: {'learning_rate': 0.2, 'max_depth': 11, 'num_leaves': 10}
# Randomized search over the same grid for comparison with the exhaustive search.
randomSearch(modelLGBM, grid, X_train, Y_train)
Best score (randomSearch): 0.8682519434119494
Best hyperparameters: {'num_leaves': 30, 'max_depth': 11, 'learning_rate': 0.1}
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, classification_report, roc_auc_score
# Refit LightGBM with the best hyperparameters found by the grid search above.
tunedLightGBM = LGBMClassifier(learning_rate = 0.2, max_depth = 11, num_leaves = 10)
scoreTunedLGBM = evaluate_model(tunedLightGBM, X, Y)
print(f"Model accuracy (cross-validation): {np.mean(scoreTunedLGBM)}")
tunedLightGBM.fit(X_train, Y_train)
# FIX: predict once and alias — the original called predict(X_test) twice,
# doing identical work twice for the same result.
y_pred = tunedLightGBM.predict(X_test)
predictions = y_pred
# NOTE: with average='micro', recall == precision == f1 == accuracy by definition,
# which is why the four printed values coincide.
recall = recall_score(Y_test, predictions, average='micro')
precision = precision_score(Y_test, predictions, average='micro')
f1 = f1_score(Y_test, predictions, average='micro')
auc = roc_auc_score(Y_test, predictions)
print(f"Model score: {tunedLightGBM.score(X_test, Y_test)}")
# FIX: dropped pointless f-prefixes on strings without placeholders; output unchanged.
print("Model recall: ", recall)
print("Model precision: ", precision)
print("Model f1:", f1)
print("Model auc:", auc)
Model accuracy (cross-validation): 0.8666829204863141 Model score: 0.8669201520912547 Model recall: 0.8669201520912547 Model precision: 0.8669201520912547 Model f1: 0.8669201520912546 Model auc: 0.7757942653450091
# Model-explanation libraries: SHAP values and dalex explainers.
import dalex as dx
import shap
def shapley(model, X):
    """Show SHAP plots for `model` on `X`: beeswarm summary, a waterfall plot
    for the first observation, and a bar summary of mean |SHAP| per feature."""
    explainer = shap.Explainer(model, X)
    shap_values = explainer(X)
    shap.summary_plot(shap_values, X)
    shap.plots.waterfall(shap_values[0])
    shap.summary_plot(shap_values, X, plot_type="bar")
# Explain the tuned LightGBM model over the whole working feature set.
shapley(tunedLightGBM, X)
99%|===================| 33884/34189 [01:11<00:00]
# dalex explainer for the tuned model; model_parts() computes permutation-based
# variable importance.
exp = dx.Explainer(tunedLightGBM, X_train, Y_train)
mp = exp.model_parts()
mp
mp.plot()
Preparation of a new explainer is initiated -> data : 23932 rows 35 cols -> target variable : 23932 values -> model_class : lightgbm.sklearn.LGBMClassifier (default) -> label : Not specified, model's class short name will be used. (default) -> predict function : <function yhat_proba_default at 0x0000019A6AB31280> will be used (default) -> predict function : Accepts pandas.DataFrame and numpy.ndarray. -> predicted values : min = 3.35e-05, mean = 0.24, max = 1.0 -> model type : classification will be used (default) -> residual function : difference between y and yhat (default) -> residuals : min = -0.979, mean = -3.74e-05, max = 0.999 -> model_info : package lightgbm A new explainer has been created!
# Hand-crafted single observation (in the one-hot-encoded feature layout)
# used for the local explanations below.
# NOTE(review): many mutually exclusive dummy groups (e.g. all education_*,
# occupation_*, hours_per_week_* columns) are set to 1 simultaneously, which
# cannot occur in the real encoded data, and index=['income_level'] labels the
# row with the target's name rather than an observation id — confirm this
# observation is what was intended.
salary = pd.DataFrame({'capital_gain': [0],
                  'capital_loss' : [0],
                  'age_Elderly': [1],
                  'age_Middle Adults': [1],
                  'age_Young Adults': [0],
                  'workclass_Gov': [1],
                  'workclass_No-gain': [1],
                  'workclass_Private': [1],
                  'workclass_Self-emp': [1],
                  'education_Associates': [1],
                  'education_Bachelors': [1],
                  'education_Doctorate': [1],
                  'education_Dropout': [1],
                  'education_HS/college': [1],
                  'education_Masters': [1],
                  'education_Prof-school': [1],
                  'marital_status_Divorced': [1],
                  'marital_status_Married': [0],
                  'marital_status_Never-married': [1],
                  'marital_status_Separated': [1],
                  'marital_status_Widowed': [1],
                  'occupation_Adm-clerical/Machine-op-inspct/Farming-fishing': [1],
                  'occupation_Craft-repair/Transport-moving': [1],
                  'occupation_Exec-managerial': [1],
                  'occupation_Handlers-cleaners/Other-service': [1],
                  'occupation_Priv-house-serv': [1],
                  'occupation_Prof-specialty/Protective-serv/Armed-Forces': [1],
                  'occupation_Tech-support/Sales': [1],
                  'hours_per_week_40hours': [1],
                  'hours_per_week_less-40hours': [1],
                  'hours_per_week_more-40hours': [1],
                  'native_country_AS': [1],
                  'native_country_EU': [1],
                  'native_country_LA': [1],
                  'native_country_NA': [1]},
                 index = ['income_level'])
# Local explanations for the crafted observation: additive break-down,
# SHAP attributions, and a LIME surrogate model.
rf_pparts = exp.predict_parts(new_observation = salary, type = "break_down")
rf_pparts.plot()
rf_pparts = exp.predict_parts(new_observation = salary, type = "shap")
rf_pparts.plot()
rf_pparts = exp.predict_surrogate(new_observation = salary)
# plot LIME
rf_pparts.show_in_notebook()